# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math as mth
from plotly import graph_objects as go
from scipy import stats as st
from datetime import datetime, timedelta
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
# suppressing warnings
import warnings
warnings.filterwarnings('ignore')
# reading csv files and assigning the result to variables
# NOTE(review): 'mobile_soures.csv' looks misspelled ("sources"), but the file
# is presumably hosted under that exact name — verify before "fixing" the URL
source = pd.read_csv('https://code.s3.yandex.net/datasets/mobile_soures.csv')
dataset = pd.read_csv('https://code.s3.yandex.net/datasets/mobile_dataset.csv')
Source
# previewing the first ten rows of the source table
source.head(n=10)
| userId | source | |
|---|---|---|
| 0 | 020292ab-89bc-4156-9acf-68bc2783f894 | other |
| 1 | cf7eda61-9349-469f-ac27-e5b6f5ec475c | yandex |
| 2 | 8c356c42-3ba9-4cb6-80b8-3f868d0192c3 | yandex |
| 3 | d9b06b47-0f36-419b-bbb0-3533e582a6cb | other |
| 4 | f32e1e2a-3027-4693-b793-b7b3ff274439 | |
| 5 | 17f6b2db-2964-4d11-89d8-7e38d2cb4750 | yandex |
| 6 | 62aa104f-592d-4ccb-8226-2ba0e719ded5 | yandex |
| 7 | 57321726-5d66-4d51-84f4-c797c35dcf2b | |
| 8 | c2cf55c0-95f7-4269-896c-931d14deaab5 | |
| 9 | 48e614d6-fe03-40f7-bf9e-4c4f61c19f64 | yandex |
# printing dataframe metadata: dtypes, non-null counts, memory usage
source.info()
# counting missing values per column
source.isna().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4293 entries, 0 to 4292 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 userId 4293 non-null object 1 source 4293 non-null object dtypes: object(2) memory usage: 67.2+ KB
userId 0 source 0 dtype: int64
# counting how many users came from each traffic source
source.loc[:, 'source'].value_counts()
yandex 1934 other 1230 google 1129 Name: source, dtype: int64
There are 3 sources.
# counting fully duplicated rows in the source table
source.duplicated(keep='first').sum()
0
There are no duplicates
Dataset
# previewing the first ten rows of the dataset table
dataset.head(n=10)
| event.time | event.name | user.id | |
|---|---|---|---|
| 0 | 2019-10-07 00:00:00.431357 | advert_open | 020292ab-89bc-4156-9acf-68bc2783f894 |
| 1 | 2019-10-07 00:00:01.236320 | tips_show | 020292ab-89bc-4156-9acf-68bc2783f894 |
| 2 | 2019-10-07 00:00:02.245341 | tips_show | cf7eda61-9349-469f-ac27-e5b6f5ec475c |
| 3 | 2019-10-07 00:00:07.039334 | tips_show | 020292ab-89bc-4156-9acf-68bc2783f894 |
| 4 | 2019-10-07 00:00:56.319813 | advert_open | cf7eda61-9349-469f-ac27-e5b6f5ec475c |
| 5 | 2019-10-07 00:01:19.993624 | tips_show | cf7eda61-9349-469f-ac27-e5b6f5ec475c |
| 6 | 2019-10-07 00:01:27.770232 | advert_open | 020292ab-89bc-4156-9acf-68bc2783f894 |
| 7 | 2019-10-07 00:01:34.804591 | tips_show | 020292ab-89bc-4156-9acf-68bc2783f894 |
| 8 | 2019-10-07 00:01:49.732803 | advert_open | cf7eda61-9349-469f-ac27-e5b6f5ec475c |
| 9 | 2019-10-07 00:01:54.958298 | advert_open | 020292ab-89bc-4156-9acf-68bc2783f894 |
# printing dataframe metadata: dtypes, non-null counts, memory usage
dataset.info()
# counting missing values per column
dataset.isna().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 74197 entries, 0 to 74196 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 event.time 74197 non-null object 1 event.name 74197 non-null object 2 user.id 74197 non-null object dtypes: object(3) memory usage: 1.7+ MB
event.time 0 event.name 0 user.id 0 dtype: int64
# counting how often each raw event type occurs
dataset.loc[:, 'event.name'].value_counts()
tips_show 40055 photos_show 10012 advert_open 6164 contacts_show 4450 map 3881 search_1 3506 favorites_add 1417 search_5 1049 tips_click 814 search_4 701 contacts_call 541 search_3 522 search_6 460 search_2 324 search_7 222 show_contacts 79 Name: event.name, dtype: int64
# counting fully duplicated rows in the dataset table
dataset.duplicated(keep='first').sum()
0
There are no duplicates
Conclusion:
Source: 4293 rows with no missing values and no duplicates; users come from three sources (yandex, other, google).
Dataset: 74197 rows with no missing values and no duplicates; 16 raw event types, including seven search_1–search_7 variants that should be merged, and a stray show_contacts that duplicates contacts_show.
# renaming columns to short snake_case names
source.columns = ['user', 'source']
dataset.columns = ['time', 'event', 'user']
# changing data types: event timestamps become datetime64
dataset['time'] = pd.to_datetime(dataset['time'])
# creating a date column (midnight-truncated datetime) for daily grouping
dataset['date'] = pd.to_datetime(dataset['time'].dt.date)
# merging search_1 - search_7 into search.
# Vectorized .loc assignment replaces the original row-by-row loop with
# chained indexing (dataset['event'][i] = ...), which is O(n) Python-level
# work and triggers pandas' SettingWithCopyWarning / may silently write
# to a copy instead of the frame.
search_all = {'search_1', 'search_2', 'search_3', 'search_4', 'search_5', 'search_6', 'search_7'}
dataset.loc[dataset['event'].isin(search_all), 'event'] = 'search'
# verifying that every search_* event was renamed to "search"
dataset[dataset['event'] == 'search']
| time | event | user | date | |
|---|---|---|---|---|
| 25 | 2019-10-07 00:05:20.190956 | search | d9b06b47-0f36-419b-bbb0-3533e582a6cb | 2019-10-07 |
| 26 | 2019-10-07 00:05:23.427766 | search | d9b06b47-0f36-419b-bbb0-3533e582a6cb | 2019-10-07 |
| 193 | 2019-10-07 08:10:01.339216 | search | 48e614d6-fe03-40f7-bf9e-4c4f61c19f64 | 2019-10-07 |
| 194 | 2019-10-07 08:10:25.295570 | search | 48e614d6-fe03-40f7-bf9e-4c4f61c19f64 | 2019-10-07 |
| 215 | 2019-10-07 09:12:06.100794 | search | af9f4d75-530c-40f9-9aab-a5535b84845e | 2019-10-07 |
| ... | ... | ... | ... | ... |
| 74089 | 2019-11-03 22:58:22.080153 | search | 5ca37f56-acd1-4bf7-abe9-7a299f085ffd | 2019-11-03 |
| 74091 | 2019-11-03 22:58:34.937794 | search | 5ca37f56-acd1-4bf7-abe9-7a299f085ffd | 2019-11-03 |
| 74114 | 2019-11-03 23:09:00.969428 | search | 28fccdf4-7b9e-42f5-bc73-439a265f20e9 | 2019-11-03 |
| 74115 | 2019-11-03 23:09:06.310431 | search | 28fccdf4-7b9e-42f5-bc73-439a265f20e9 | 2019-11-03 |
| 74194 | 2019-11-03 23:56:57.041825 | search | 20850c8f-4135-4059-b13b-198d3ac59902 | 2019-11-03 |
6784 rows × 4 columns
# renaming the stray "show_contacts" event to the canonical "contacts_show"
dataset.loc[dataset['event'] == 'show_contacts', 'event'] = "contacts_show"
# re-counting event types after the merges and renames
dataset.loc[:, 'event'].value_counts()
tips_show 40055 photos_show 10012 search 6784 advert_open 6164 contacts_show 4529 map 3881 favorites_add 1417 tips_click 814 contacts_call 541 Name: event, dtype: int64
# re-checking the dataset table for duplicate rows after preprocessing
dataset.duplicated(keep='first').sum()
# re-checking the source table for duplicate rows
source.duplicated(keep='first').sum()
Conclusion:
# spot-checking three random rows of the source table
source.sample(n=3)
| user | source | |
|---|---|---|
| 2486 | 01b4ca51-930d-4518-aa09-8a8c35e1d9cc | |
| 1597 | 1a717649-1f48-4c68-a5e6-09d9dd9bd150 | |
| 3107 | d64423dd-22d5-4903-bfb4-cf60a1fb410f |
# counting the number of rows and unique users
n_rows = len(source)
n_users = source["user"].nunique()
print(f'Total rows - {n_rows} \nUnique users - {n_users}')
Total rows - 4293 Unique users - 4293
All users in the source table are unique.
Now you can evaluate the sources in terms of user acquisition.
# evaluating the sources in terms of user acquisition:
# unique users per source and each source's share of the total
users_by_source = (
    source.groupby('source')
          .agg(unique_users=('user', 'nunique'))
)
users_by_source['percent_unique_users'] = (
    users_by_source['unique_users']
    .div(users_by_source['unique_users'].sum())
    .round(4) * 100
)
users_by_source.sort_values(by='percent_unique_users', ascending=False)
| unique_users | percent_unique_users | |
|---|---|---|
| source | ||
| yandex | 1934 | 45.05 |
| other | 1230 | 28.65 |
| 1129 | 26.30 |
Among the top sources of traffic, Yandex ranks first, followed by Other (other sources), and Google in third place.
It appears that more money is spent on buying traffic in Yandex than in Google. It is important to check whether this traffic pays off. If the performance of traffic from Google is not worse, then the redistribution of budgets can be considered.
# spot-checking three random rows of the dataset table
dataset.sample(n=3)
| time | event | user | date | |
|---|---|---|---|---|
| 37399 | 2019-10-22 10:11:48.760869 | search | 4f3e9680-e104-402b-958a-74207311966a | 2019-10-22 |
| 28282 | 2019-10-18 14:22:23.474824 | tips_show | addeebe6-3778-4a6d-8792-02b6dfbf61aa | 2019-10-18 |
| 11538 | 2019-10-12 10:20:19.298076 | tips_show | e922846e-28b0-4a3f-816e-70818a93d2c0 | 2019-10-12 |
# counting the number of rows and unique users
# (total_rows / unique_users are reused by later cells — keep the names)
total_rows = dataset.shape[0]
unique_users = dataset['user'].nunique()
print(f'Total rows - {total_rows} \nUnique users - {unique_users}')
Total rows - 74197 Unique users - 4293
The number of unique users is the same as in the source table.
# calculating the average / median number of events per user over the
# whole observed period.
# Fix: the original labels said "per day", but total_rows/unique_users is
# events per USER across the entire ~28-day window, not a daily rate.
print('Average number of events per user -', round(total_rows/unique_users, 1))
print('Median number of events per user -', round(dataset.groupby('user')['event'].count().median(), 1))
Average number of user events per day - 17.3 Median number of user events per day - 9.0
# checking the minimum date of the observed period
min_date = dataset['time'].min()
# checking the maximum date of the observed period
max_date = dataset['time'].max()
# checking the length of the observed period
time_delta = max_date - min_date
print(f'Minimum date: {min_date}')
print(f'Maximum date: {max_date}')
print(f'Timedelta: {time_delta}')
Minimum date: 2019-10-07 00:00:00.431357 Maximum date: 2019-11-03 23:58:12.532487 Timedelta: 27 days 23:58:12.101130
Data is available from October 7 to November 3, 2019. There is almost a 28-day time difference.
Let's see if there are very active users, as their actions may distort further analysis.
# counting the number of events per user, most active users first
events_per_user = dataset.groupby('user').agg({'event': 'count'})
events_per_user.columns = ['event_count']
events_per_user.sort_values(by='event_count', ascending=False)
| event_count | |
|---|---|
| user | |
| cb36854f-570a-41f4-baa8-36680b396370 | 478 |
| e13f9f32-7ae3-4204-8d60-898db040bcfc | 465 |
| 9ce63488-758a-481a-bcb5-a02b467e1d84 | 407 |
| 21230dd9-2f7f-4b77-a436-43d4d10388e0 | 400 |
| be1449f6-ca45-4f94-93a7-ea4b079b8f0f | 397 |
| ... | ... |
| 55f7dc86-a03b-4cff-adc6-61a23d4ab680 | 1 |
| 800920e6-c654-434c-966e-8e3ebcee0d38 | 1 |
| b3aa5ec3-6e23-4fcf-bdcc-c0527228fd47 | 1 |
| 6a2e7783-a775-4c7d-adf0-22d1983ce9bd | 1 |
| 8bc67b38-7e6d-4084-b354-58b30d8f1e6d | 1 |
4293 rows × 1 columns
# computing summary statistics for the per-user event counts
events_per_user['event_count'].describe().to_frame()
| event_count | |
|---|---|
| count | 4293.000000 |
| mean | 17.283252 |
| std | 29.130677 |
| min | 1.000000 |
| 25% | 5.000000 |
| 50% | 9.000000 |
| 75% | 17.000000 |
| max | 478.000000 |
# plotting the per-user event counts as a scatter to spot outliers
fig = go.Figure(
    go.Scatter(y=events_per_user['event_count'], mode='markers')
)
fig.update_layout(
    title='Distribution of events per user',
    xaxis_title='Users',
    yaxis_title='Number of events',
    margin=dict(l=0, r=0),
)
fig.update_traces(hoverinfo="all", hovertemplate='Number of events: %{y}')
fig.show()
There are 4293 users and 74197 events in total: on average 17.3 events per user, a median of 9, and a maximum of 478 events for a single user.
There are anomalously active users present. Let's look at the percentiles to figure out where the anomaly borders.
# reporting the 75th, 50th, 95th and 99th percentiles of a column
def percentile(df, column):
    """Print how much of df[column] lies above its 75/50/95/99th percentiles.

    Fixes from review: the first label wrongly claimed "no more than 75%"
    of the data exceeds the 75th percentile — by definition no more than
    25% does.  Variables are renamed to match the percentile they hold
    (the old names `five`/`one` actually held the 95th/99th percentiles).
    """
    p75, p50, p95, p99 = np.percentile(df[column], [75, 50, 95, 99])
    print(
        'No more than 25% of the data is greater than the value {:.2f}\n'
        'No more than 50% of the data is greater than the value {:.2f}\n'
        'No more than 5% of the data is greater than the value {:.2f}\n'
        'No more than 1% of the data is greater than the value {:.2f}'.format(p75, p50, p95, p99))
percentile(events_per_user, 'event_count')
No more than 75% of the data is greater than the value 17.00 No more than 50% of the data is greater than the value 9.00 No more than 5% of the data is greater than the value 59.00 No more than 1% of the data is greater than the value 132.00
As can be seen from the graph, users performing more than 100 actions are rather an exception. After calculations, only 1% of users perform more than 132 actions in the application. Let's remove them from further analysis.
In addition, users who have completed one action will not be useful for our study.
# counting users who have completed exactly one action
events_per_user.loc[events_per_user['event_count'] == 1].count()
event_count 65 dtype: int64
There aren't many of these users. Let's delete them.
# saving users with less than 132 and more than 1 events
# NOTE(review): the strict '< 132' bound also drops users with exactly 132
# events, while the surrounding text says only users with MORE than 132
# actions are removed — confirm the intended boundary (downstream counts
# of 4184 / 109 depend on it)
events_per_user2 = (events_per_user[(events_per_user['event_count'] > 1) & (events_per_user['event_count'] < 132)])
# keeping only the user ids (the index) as a plain list for later .isin() filtering
events_per_user2 = list(events_per_user2.index)
# checking the length of the dataframe
len(events_per_user2)
4184
# merging 2 tables into one, ordered chronologically
df = source.merge(dataset, on='user', how='left').sort_values(by='time')
# keeping only users that passed the activity filter (2..131 events)
df = df.loc[df['user'].isin(events_per_user2)].copy()
# counting how many users have been deleted
removed = dataset["user"].nunique() - df['user'].nunique()
remaining = df['user'].nunique()
print('Number of deleted users: {}\n'
      'Remaining unique users: {}'.format(removed, remaining))
Number of deleted users: 109 Remaining unique users: 4184
A total of 109 users have been removed for performing either a single action or more than 132 actions within the application.
The next step is to calculate the distribution of events by day.
# calculating the distribution of events by day
event_grouped = (
    dataset.groupby(['date', 'event'], as_index=False)['time'].count()
)
event_grouped
| date | event | time | |
|---|---|---|---|
| 0 | 2019-10-07 | advert_open | 401 |
| 1 | 2019-10-07 | contacts_call | 7 |
| 2 | 2019-10-07 | contacts_show | 61 |
| 3 | 2019-10-07 | favorites_add | 40 |
| 4 | 2019-10-07 | map | 168 |
| ... | ... | ... | ... |
| 247 | 2019-11-03 | map | 65 |
| 248 | 2019-11-03 | photos_show | 554 |
| 249 | 2019-11-03 | search | 279 |
| 250 | 2019-11-03 | tips_click | 28 |
| 251 | 2019-11-03 | tips_show | 1382 |
252 rows × 3 columns
# plotting daily event volumes as a stacked bar chart
fig = px.bar(event_grouped, x='date', y='time', color='event',
             title='The distribution of events by day')
fig.show()
Since we see very few tips_click events, the vast majority of tips_show events (the user saw recommended ads) are automatic and do not indicate interaction with the mobile application. Let's try to visualize this without tips_show.
# plotting the same daily distribution with tips_show excluded
fig1 = px.bar(event_grouped[event_grouped['event'] != 'tips_show'],
              x='date', y='time', color='event')
fig1.update_layout(title_text='The distribution of events by day without tips_show')
fig1.show()
# identifying what types of events there are and how often they occur
df.groupby('event')['user'].count().to_frame().sort_values(by='user', ascending=False)
| user | |
|---|---|
| event | |
| tips_show | 34118 |
| photos_show | 9364 |
| search | 6434 |
| advert_open | 4972 |
| contacts_show | 3646 |
| map | 3217 |
| favorites_add | 1222 |
| tips_click | 719 |
| contacts_call | 537 |
Tips_show is the most frequent event.
Calculating how many unique users completed each of these events. Sorting events by the number of unique users. Calculating the proportion of unique users who completed at least one event.
# calculating how many unique users completed each of these events,
# as well as each event's share of all unique users,
# sorted by the number of unique users
event_user_count = (
    df.groupby('event')['user'].nunique().to_frame()
      .sort_values(by='user', ascending=False)
)
event_user_count['percent'] = event_user_count['user'].div(df['user'].nunique())
event_user_count.style.format({'percent': '{:,.2%}'})
| user | percent | |
|---|---|---|
| event | ||
| tips_show | 2731 | 65.27% |
| search | 1632 | 39.01% |
| map | 1404 | 33.56% |
| photos_show | 1086 | 25.96% |
| contacts_show | 949 | 22.68% |
| advert_open | 722 | 17.26% |
| favorites_add | 338 | 8.08% |
| tips_click | 306 | 7.31% |
| contacts_call | 212 | 5.07% |
Tips_show was completed by the most users.
Visualizing funnel of sales where the site search is used to determine what percentage of unique users go to the next step.
Let's form a dataframe in which tips_show users will be removed. Let's leave only search, advert_open, photos_show, and contacts_show cards.
# rows where users saw recommended ads
tips_show = df[df['event'] == 'tips_show']
# unique users who ever saw recommendations (reused by the next funnel)
tips_show_users = list(tips_show['user'].unique())
# rows where users searched
search = df[df['event'] == 'search']
# unique users who ever searched (reused by the next funnel)
search_users = list(search['user'].unique())
# search funnel: searchers who never saw tips_show ...
search_funnel = df[~df['user'].isin(tips_show_users)]
search_funnel = search_funnel[search_funnel['user'].isin(search_users)]
# ... restricted to the four funnel steps
funnel_steps = ['search', 'advert_open', 'photos_show', 'contacts_show']
search_funnel = search_funnel[search_funnel['event'].isin(funnel_steps)]
# counting unique users per step, in funnel order
search_funnel_count = (
    search_funnel.groupby('event')['user'].nunique()
    .to_frame().reindex(funnel_steps)
)
# plotting the funnel of sales
fig = go.Figure(go.Funnel(
    y=search_funnel_count.index,
    x=search_funnel_count['user'],
    textposition="auto",
    textinfo="value+percent initial+percent previous",
))
fig.update_layout(title='Funnel of sales with users who came from search engine')
fig.show()
When building a funnel of sales with search, advert_open, photos_show, contacts_show cards, we found out that:
It takes a lot of time to search and only 8% of users open ads. Users probably have the choice to open the ad or the photos right away. 74% of users open photos immediately + 8% of users open ads = 82%. 27% of initial users view contacts.
Visualizing funnel of sales where the tips_show is used to determine what percentage of unique users go to the next step.
# recommendations funnel: tips_show viewers who never searched ...
tips_show_funnel = df[~df['user'].isin(search_users)]
tips_show_funnel = tips_show_funnel[tips_show_funnel['user'].isin(tips_show_users)]
# ... restricted to the four funnel steps
tips_steps = ['tips_show', 'tips_click', 'favorites_add', 'contacts_show']
tips_show_funnel = tips_show_funnel[tips_show_funnel['event'].isin(tips_steps)]
# counting unique users per step, in funnel order
tips_show_funnel_count = (
    tips_show_funnel.groupby('event')['user'].nunique()
    .to_frame().reindex(tips_steps)
)
# plotting the funnel of sales
fig = go.Figure(go.Funnel(
    y=tips_show_funnel_count.index,
    x=tips_show_funnel_count['user'],
    textposition="auto",
    textinfo="value+percent initial+percent previous",
))
fig.update_layout(title='Funnel of sales with users who came from recommender systems')
fig.show()
When building a funnel of sales with tips_show, tips_click, favorites_add, contacts_show cards, we found out that:
The distorted shape of the funnel tells us that there are problems in the processes. Many users enter the funnel, but only a few make it to the next stage. This indicates a marketing error - showing the wrong ads to the wrong audience. There is a need to improve the ad serving system, since only 18% of contacts are viewed.
# checking each user's first and last application usage dates
first_visit = df.groupby('user', as_index=False)['date'].min()
first_visit.columns = ['user', 'first_visit']
last_visit = df.groupby('user', as_index=False)['date'].max()
last_visit.columns = ['user', 'last_visit']
visit = first_visit.merge(last_visit, on='user')
visit.head()
| user | first_visit | last_visit | |
|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 2019-10-07 | 2019-10-22 |
| 1 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 2019-10-19 | 2019-11-03 |
| 2 | 00463033-5717-4bf1-91b4-09183923b9df | 2019-11-01 | 2019-11-01 |
| 3 | 004690c3-5a84-4bb7-a8af-e0c8f8fca64e | 2019-10-18 | 2019-10-31 |
| 4 | 00551e79-152e-4441-9cf7-565d7eb04090 | 2019-10-25 | 2019-10-29 |
# counting the number of whole days between each user's first and last visit
visit['lifetime'] = visit['last_visit'].sub(visit['first_visit']).dt.days
visit.head()
| user | first_visit | last_visit | lifetime | |
|---|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 2019-10-07 | 2019-10-22 | 15 |
| 1 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 2019-10-19 | 2019-11-03 | 15 |
| 2 | 00463033-5717-4bf1-91b4-09183923b9df | 2019-11-01 | 2019-11-01 | 0 |
| 3 | 004690c3-5a84-4bb7-a8af-e0c8f8fca64e | 2019-10-18 | 2019-10-31 | 13 |
| 4 | 00551e79-152e-4441-9cf7-565d7eb04090 | 2019-10-25 | 2019-10-29 | 4 |
If the user's first and last visit fall on the same day, then the difference will be 0. Let's add one to find out how many days the application was used.
# adding 1 so a same-day first/last visit counts as one day of usage
visit['lifetime'] += 1
# creating a separate user -> total-days table
user_days = visit[['user', 'lifetime']].copy()
user_days.head()
| user | lifetime | |
|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 16 |
| 1 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 16 |
| 2 | 00463033-5717-4bf1-91b4-09183923b9df | 1 |
| 3 | 004690c3-5a84-4bb7-a8af-e0c8f8fca64e | 14 |
| 4 | 00551e79-152e-4441-9cf7-565d7eb04090 | 5 |
# analyzing the distribution of users by the number of days they have used the app
user_days.groupby('lifetime')['user'].count().to_frame().sort_values(by='user', ascending=False)
| user | |
|---|---|
| lifetime | |
| 1 | 2691 |
| 2 | 299 |
| 3 | 145 |
| 4 | 144 |
| 6 | 101 |
| 8 | 99 |
| 5 | 98 |
| 9 | 74 |
| 7 | 64 |
| 10 | 59 |
| 14 | 45 |
| 11 | 43 |
| 16 | 42 |
| 12 | 38 |
| 18 | 34 |
| 15 | 31 |
| 13 | 31 |
| 17 | 21 |
| 23 | 20 |
| 19 | 18 |
| 20 | 18 |
| 22 | 18 |
| 21 | 15 |
| 26 | 12 |
| 24 | 10 |
| 27 | 7 |
| 25 | 5 |
| 28 | 2 |
# plotting the distribution of app-usage days per user.
# Fix: removed a stray duplicated groupby line that recomputed the summary
# table mid-plot and discarded the result (dead code left from the cell above).
user_days['lifetime'].hist(bins=28)
plt.title('Distribution of users by the number of days they have used the app')
plt.xlabel('Days')
plt.ylabel('Users')
plt.show()
During the selected period, most users used the application for only one day.
# attaching each user's total usage days (lifetime) to the event table
df = df.merge(user_days, how='inner', on='user')
df.head()
| user | source | time | event | date | lifetime | |
|---|---|---|---|---|---|---|
| 0 | 020292ab-89bc-4156-9acf-68bc2783f894 | other | 2019-10-07 00:00:00.431357 | advert_open | 2019-10-07 | 1 |
| 1 | 020292ab-89bc-4156-9acf-68bc2783f894 | other | 2019-10-07 00:00:01.236320 | tips_show | 2019-10-07 | 1 |
| 2 | 020292ab-89bc-4156-9acf-68bc2783f894 | other | 2019-10-07 00:00:07.039334 | tips_show | 2019-10-07 | 1 |
| 3 | 020292ab-89bc-4156-9acf-68bc2783f894 | other | 2019-10-07 00:01:27.770232 | advert_open | 2019-10-07 | 1 |
| 4 | 020292ab-89bc-4156-9acf-68bc2783f894 | other | 2019-10-07 00:01:34.804591 | tips_show | 2019-10-07 | 1 |
Let's see how long users used the app and what events were the first to occur.
# calculating, per user, the time gap between consecutive events
df = df.sort_values(by=['user', 'time']).reset_index(drop=True)
# a stable integer row id, used later to label sessions
df['id'] = np.arange(len(df))
df['diff'] = df.groupby('user')['time'].diff()
df.head()
| user | source | time | event | date | lifetime | id | diff | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:39:45.989359 | tips_show | 2019-10-07 | 16 | 0 | NaT |
| 1 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:40:31.052909 | tips_show | 2019-10-07 | 16 | 1 | 0 days 00:00:45.063550 |
| 2 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:41:05.722489 | tips_show | 2019-10-07 | 16 | 2 | 0 days 00:00:34.669580 |
| 3 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:43:20.735461 | tips_show | 2019-10-07 | 16 | 3 | 0 days 00:02:15.012972 |
| 4 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:45:30.917502 | tips_show | 2019-10-07 | 16 | 4 | 0 days 00:02:10.182041 |
Let's create sessions_start dataframe, which will contain the first session events, those that occurred more than 30 minutes after the previous one, or the events that were the first for the user. Also, let's create a column that will contain the iD of the first session event.
# creating sessions_start dataframe with first session events:
# a user's very first event (diff is NaT) or any event that happened more
# than 30 minutes after the previous one.
# Fix: .copy() added — the original assigned a new column to a boolean-mask
# slice of df, which raises SettingWithCopyWarning and may not modify the
# intended frame.
sessions_start = df[(df['diff'].isnull()) | (df['diff'] > '1800 seconds')].copy()
sessions_start['session_id'] = sessions_start['id']
sessions_start.head()
| user | source | time | event | date | lifetime | id | diff | session_id | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:39:45.989359 | tips_show | 2019-10-07 | 16 | 0 | NaT | 0 |
| 9 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-09 18:33:55.577963 | map | 2019-10-09 | 16 | 9 | 2 days 04:44:13.861346 | 9 |
| 13 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-21 19:52:30.778932 | tips_show | 2019-10-21 | 16 | 13 | 12 days 01:10:07.814984 | 13 |
| 27 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-22 11:18:14.635436 | map | 2019-10-22 | 16 | 27 | 0 days 15:10:44.584408 | 27 |
| 35 | 00157779-810c-4498-9e05-a1e9e3cedf93 | yandex | 2019-10-19 21:34:33.849769 | search | 2019-10-19 | 16 | 35 | NaT | 35 |
# merging tables based on closest key matches:
# merge_asof matches every event row to the most recent session-start row
# with id <= the event's id within the same user (by='user'), so each event
# inherits the session_id of the session it belongs to.
# Both frames must be sorted by the 'on' key ('id') for merge_asof to work.
df = df.sort_values('id')
sessions_start = sessions_start.sort_values('id')
df = pd.merge_asof(df,sessions_start[['id','user','session_id']],on='id',by='user')
df = df.sort_values(['user','time'])
df.head()
| user | source | time | event | date | lifetime | id | diff | session_id | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:39:45.989359 | tips_show | 2019-10-07 | 16 | 0 | NaT | 0 |
| 1 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:40:31.052909 | tips_show | 2019-10-07 | 16 | 1 | 0 days 00:00:45.063550 | 0 |
| 2 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:41:05.722489 | tips_show | 2019-10-07 | 16 | 2 | 0 days 00:00:34.669580 | 0 |
| 3 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:43:20.735461 | tips_show | 2019-10-07 | 16 | 3 | 0 days 00:02:15.012972 | 0 |
| 4 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:45:30.917502 | tips_show | 2019-10-07 | 16 | 4 | 0 days 00:02:10.182041 | 0 |
# flagging rows that open a session (its own id equals the session's id)
df['is_first_event_in_session'] = df['id'].eq(df['session_id'])
df.head()
| user | source | time | event | date | lifetime | id | diff | session_id | is_first_event_in_session | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:39:45.989359 | tips_show | 2019-10-07 | 16 | 0 | NaT | 0 | True |
| 1 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:40:31.052909 | tips_show | 2019-10-07 | 16 | 1 | 0 days 00:00:45.063550 | 0 | False |
| 2 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:41:05.722489 | tips_show | 2019-10-07 | 16 | 2 | 0 days 00:00:34.669580 | 0 | False |
| 3 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:43:20.735461 | tips_show | 2019-10-07 | 16 | 3 | 0 days 00:02:15.012972 | 0 | False |
| 4 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:45:30.917502 | tips_show | 2019-10-07 | 16 | 4 | 0 days 00:02:10.182041 | 0 | False |
# counting the number of events in each session
n_event = (
    df.groupby(['user', 'session_id'], as_index=False)['event'].count()
      .rename(columns={'event': 'n_events'})
)
n_event
| user | session_id | n_events | |
|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 0 | 9 |
| 1 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 9 | 4 |
| 2 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 13 | 14 |
| 3 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 27 | 8 |
| 4 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 35 | 9 |
| ... | ... | ... | ... |
| 9423 | ffc01466-fdb1-4460-ae94-e800f52eb136 | 64207 | 7 |
| 9424 | ffcf50d9-293c-4254-8243-4890b030b238 | 64214 | 2 |
| 9425 | ffe68f10-e48e-470e-be9b-eeb93128ff1a | 64216 | 3 |
| 9426 | ffe68f10-e48e-470e-be9b-eeb93128ff1a | 64219 | 3 |
| 9427 | ffe68f10-e48e-470e-be9b-eeb93128ff1a | 64222 | 7 |
9428 rows × 3 columns
# plotting the full distribution of events per session
ax = n_event['n_events'].hist(bins=60)
ax.set_title('Distribution of users by the number of events in a session')
ax.set_xlabel('Events per session')
ax.set_ylabel('Users')
plt.show()
# plotting the same distribution zoomed into the 0-30 range
ax = n_event['n_events'].hist(bins=20, range=(0, 30))
ax.set_title('Distribution of users by the number of events in a session')
ax.set_xlabel('Events per session')
ax.set_ylabel('Users')
plt.show()
# counting the number of sessions per user and events per session.
# Fix: removed the stray Cyrillic "я" from the first print label ("userя").
cnt_session = n_event.groupby('user')['session_id'].count()
print('Average number of sessions per user: {:.2f}'.format(cnt_session.mean()))
print('Average number of events in 1 session: {:.2f}'.format(n_event['n_events'].mean()))
Average number of sessions per userя: 2.25 Average number of events in 1 session: 6.81
# identifying the sessions with the most events
n_event.sort_values('n_events', ascending=False)
| user | session_id | n_events | |
|---|---|---|---|
| 9021 | f5d919ef-0afb-4b6f-958c-8c840aea7852 | 61544 | 104 |
| 8510 | e549f8ef-653b-4c5c-a6bd-8970e6bd860b | 57800 | 104 |
| 269 | 07a043c4-8168-4691-88ce-7b3c563432d2 | 2000 | 97 |
| 4251 | 74041397-d70d-49ab-8d8e-7f9d00f2ffea | 28543 | 91 |
| 792 | 13140930-df18-4793-a230-7cca5c8813db | 5469 | 86 |
| ... | ... | ... | ... |
| 1276 | 1fc5b6cf-6e2f-4608-b136-c10e5d8ca105 | 9167 | 1 |
| 6694 | b6cbe96a-842f-4c96-97d6-d8565eed7a5d | 45697 | 1 |
| 7922 | d6072f5d-8c19-44f7-95df-7eb7f4a5ecf0 | 53861 | 1 |
| 713 | 114ba847-9a89-49a9-af35-3825c241481a | 5098 | 1 |
| 826 | 13b142e5-6f36-4512-93e9-72bdb3ff6d7c | 5799 | 1 |
9428 rows × 3 columns
# inspecting the event mix inside session 61544 (one of the two longest)
df[df['session_id'] == 61544]['event'].value_counts()
tips_show 46 advert_open 43 contacts_show 7 map 5 search 3 Name: event, dtype: int64
# checking events for user e549f8ef-653b-4c5c-a6bd-8970e6bd860b
# (owner of the other 104-event session)
# NOTE(review): the original comment referenced user bd74ba4a-…, which does
# not match the id actually queried below
df.query('user == "e549f8ef-653b-4c5c-a6bd-8970e6bd860b"')['event'].value_counts()
tips_show 79 contacts_show 29 search 4 Name: event, dtype: int64
Each user has on average 2 sessions. There are almost 7 events per session, ranging from 1 when users exit the application to 104 when they actively search.
# identifying which event types open sessions
df.loc[df['is_first_event_in_session'], 'event'].value_counts()
tips_show 3273 search 2354 photos_show 1607 map 1180 contacts_show 506 advert_open 361 favorites_add 105 tips_click 42 Name: event, dtype: int64
Since users return from the background, any event becomes the start of the session.
Let's create a table of users that contains the date of the first event and the date of the first target event.
# creating a table with the timestamp of each session's first event
first_event = df.groupby(['user', 'session_id'], as_index=False)['time'].min()
first_event.head()
| user | session_id | time | |
|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 0 | 2019-10-07 13:39:45.989359 |
| 1 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 9 | 2019-10-09 18:33:55.577963 |
| 2 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 13 | 2019-10-21 19:52:30.778932 |
| 3 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 27 | 2019-10-22 11:18:14.635436 |
| 4 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 35 | 2019-10-19 21:34:33.849769 |
# creating a table with the timestamp of the first target event
# (contacts_show) in each session where it occurred
first_contacts_show = (
    df[df['event'] == 'contacts_show']
    .groupby(['user', 'session_id'], as_index=False)['time'].min()
    .rename(columns={'time': 'time_first_contacts_show'})
)
first_contacts_show.head()
| user | session_id | time_first_contacts_show | |
|---|---|---|---|
| 0 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 44 | 2019-10-20 19:17:18.659799 |
| 1 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 71 | 2019-10-29 21:26:40.258472 |
| 2 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 91 | 2019-10-30 08:01:05.420773 |
| 3 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 105 | 2019-11-03 17:12:09.708771 |
| 4 | 00551e79-152e-4441-9cf7-565d7eb04090 | 148 | 2019-10-25 16:44:41.263364 |
# adding a column with the name of each session's first event.
# Fix: a single left merge on (user, time) replaces the original
# DataFrame.apply helper, which scanned the entire df once per row
# (O(n^2)).  drop_duplicates(keep='first') reproduces the original
# "take the first match" semantics if a (user, time) pair repeats.
first_event = first_event.merge(
    df[['user', 'time', 'event']]
      .drop_duplicates(subset=['user', 'time'], keep='first')
      .rename(columns={'event': 'first_event'}),
    on=['user', 'time'], how='left')
first_event.head()
| user | session_id | time | first_event | |
|---|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 0 | 2019-10-07 13:39:45.989359 | tips_show |
| 1 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 9 | 2019-10-09 18:33:55.577963 | map |
| 2 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 13 | 2019-10-21 19:52:30.778932 | tips_show |
| 3 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 27 | 2019-10-22 11:18:14.635436 | map |
| 4 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 35 | 2019-10-19 21:34:33.849769 | search |
"Contacts_show" appeared in 1478 sessions, and there were 9428 sessions in total.
# joining the first-event and first-target-event tables per session,
# then measuring how long it took to reach the target event
time_diff = (
    first_contacts_show
    .merge(first_event, on=['user', 'session_id'], how='left')
    .rename(columns={'time': 'time_first_event'})
)
time_diff['time_delta'] = (
    time_diff['time_first_contacts_show'] - time_diff['time_first_event']
)
time_diff.sort_values(by='time_delta').head()
| user | session_id | time_first_contacts_show | time_first_event | first_event | time_delta | |
|---|---|---|---|---|---|---|
| 1477 | ffe68f10-e48e-470e-be9b-eeb93128ff1a | 64219 | 2019-10-22 16:07:17.683553 | 2019-10-22 16:07:17.683553 | contacts_show | 0 days |
| 539 | 62a5375a-eb94-4ed2-90ef-3d79d8e0c359 | 24316 | 2019-10-12 15:30:48.136818 | 2019-10-12 15:30:48.136818 | contacts_show | 0 days |
| 540 | 62a5375a-eb94-4ed2-90ef-3d79d8e0c359 | 24319 | 2019-10-12 16:25:18.824383 | 2019-10-12 16:25:18.824383 | contacts_show | 0 days |
| 542 | 62a5375a-eb94-4ed2-90ef-3d79d8e0c359 | 24419 | 2019-10-25 19:20:53.639672 | 2019-10-25 19:20:53.639672 | contacts_show | 0 days |
| 543 | 62a5375a-eb94-4ed2-90ef-3d79d8e0c359 | 24427 | 2019-10-26 11:17:49.415812 | 2019-10-26 11:17:49.415812 | contacts_show | 0 days |
For several sessions, "time_delta" equals 0 because the session itself started with the "contacts_show" event.
# confirming that every zero-delta session actually opened with contacts_show
zero_delta_rows = time_diff['time_delta'] == '0 days 00:00:00'
time_diff.loc[zero_delta_rows, 'first_event'].value_counts()
contacts_show 506 Name: first_event, dtype: int64
With contacts_show, 506 sessions started.
# re-checking the distribution of session-opening events
df.loc[df['is_first_event_in_session'], 'event'].value_counts()
tips_show 3273 search 2354 photos_show 1607 map 1180 contacts_show 506 advert_open 361 favorites_add 105 tips_click 42 Name: event, dtype: int64
# counting the share of sessions that opened directly with contacts_show
# fix: the count is now derived from the data instead of the hard-coded 506,
# so the cell stays correct if the upstream filtering changes
zero_delta_sessions = (time_diff['time_delta'] == pd.Timedelta(0)).sum()
round(zero_delta_sessions / len(time_diff) * 100, 2)
34.24
Even though such sessions represent 34.24% of the total, we will discard them to avoid distorting the results.
# keeping only sessions where the target event was not the very first one
# (dropping the rows with time_delta == 0)
nonzero_delta = time_diff['time_delta'] != '0 days 00:00:00'
time_diff = time_diff[nonzero_delta].reset_index(drop=True)
time_diff.head()
| user | session_id | time_first_contacts_show | time_first_event | first_event | time_delta | |
|---|---|---|---|---|---|---|
| 0 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 44 | 2019-10-20 19:17:18.659799 | 2019-10-20 18:49:24.115634 | search | 0 days 00:27:54.544165 |
| 1 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 71 | 2019-10-29 21:26:40.258472 | 2019-10-29 21:18:24.850073 | search | 0 days 00:08:15.408399 |
| 2 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 91 | 2019-10-30 08:01:05.420773 | 2019-10-30 07:50:45.948358 | search | 0 days 00:10:19.472415 |
| 3 | 00551e79-152e-4441-9cf7-565d7eb04090 | 150 | 2019-10-28 13:10:40.331441 | 2019-10-28 13:08:15.809056 | search | 0 days 00:02:24.522385 |
| 4 | 007d031d-5018-4e02-b7ee-72a30609173f | 206 | 2019-10-22 13:08:09.140381 | 2019-10-22 13:02:26.636223 | map | 0 days 00:05:42.504158 |
We have 972 sessions left.
# calculating the mean and median time between the first and target event for each user
# improvement: the built-in GroupBy.mean()/median() replace the per-group
# lambda applies — identical results, computed in one optimized pass
grouped = time_diff.groupby('user')['time_delta']
mean_time_delta = grouped.mean()
median_time_delta = grouped.median()
# displaying on the screen, slowest users first
mean_time_delta.sort_values(ascending=False)
user
65480689-8db7-4211-aeba-5aec94d5d0d5 0 days 01:28:09.923892
543199e1-9439-4ad5-9752-d41f8925d446 0 days 01:14:47.563661
3bf1edba-5d7f-4eb2-911e-0a85d370a9c1 0 days 01:11:27.356971
62b719cf-659d-4bd7-9a40-db768791dea3 0 days 01:00:37.327980
2a45346e-cbdb-47b6-a652-b7d0d2e92391 0 days 00:57:49.879418
...
03bef3ef-cce8-46ed-8c70-414b6b0486fb 0 days 00:00:04.672536
462069d5-4ecd-410f-8075-975822219638 0 days 00:00:04.208288
7a4806d3-b727-43f7-9edc-d5068869d95d 0 days 00:00:02.125164
f553a25f-67a7-4c5b-ab08-9f56a388cdb4 0 days 00:00:00.294110
fbdabf5f-ea95-47d5-96b3-75e136336477 0 days 00:00:00.263470
Name: time_delta, Length: 724, dtype: timedelta64[ns]
# showing the per-user medians, largest first
median_time_delta.sort_values(ascending=False)
user
65480689-8db7-4211-aeba-5aec94d5d0d5 0 days 01:28:09.923892
543199e1-9439-4ad5-9752-d41f8925d446 0 days 01:14:47.563661
3bf1edba-5d7f-4eb2-911e-0a85d370a9c1 0 days 01:11:27.356971
62b719cf-659d-4bd7-9a40-db768791dea3 0 days 01:00:37.327980
2a45346e-cbdb-47b6-a652-b7d0d2e92391 0 days 00:57:49.879418
...
03bef3ef-cce8-46ed-8c70-414b6b0486fb 0 days 00:00:04.672536
462069d5-4ecd-410f-8075-975822219638 0 days 00:00:04.208288
7a4806d3-b727-43f7-9edc-d5068869d95d 0 days 00:00:02.125164
f553a25f-67a7-4c5b-ab08-9f56a388cdb4 0 days 00:00:00.294110
fbdabf5f-ea95-47d5-96b3-75e136336477 0 days 00:00:00.263470
Name: time_delta, Length: 724, dtype: timedelta64[ns]
We calculated the mean and median time between the first and target event for 724 users.
# calculating the mean and median time between the first and target event in one session
# fix: the original f-string used "\M", which is not an escape sequence and
# printed a literal backslash; "\n" puts the median on its own line
print(f"Average time = {time_diff['time_delta'].mean()} \nMedian time = {time_diff['time_delta'].median()}")
Average time = 0 days 00:09:21.666576305 \Median time = 0 days 00:04:49.595856500
# summary statistics of the time from first to target event
time_diff['time_delta'].describe()
count 972 mean 0 days 00:09:21.666576305 std 0 days 00:12:15.320901456 min 0 days 00:00:00.158918 25% 0 days 00:01:25.489593750 50% 0 days 00:04:49.595856500 75% 0 days 00:11:53.551617250 max 0 days 01:38:05.551664 Name: time_delta, dtype: object
Check out the histogram for the distribution of time in minutes.
# histograms of the first-to-target time, converted to minutes
minutes_delta = (time_diff['time_delta'].dt.total_seconds() / 60).round(2)
# full range first, then zoomed to the first 50 minutes
for hist_range in (None, (0, 50)):
    minutes_delta.hist(figsize=(15, 3), bins=60, range=hist_range)
    plt.title('Time between the first and target event in minutes per session')
    plt.ylabel('Number of users')
    plt.xlabel('Minutes')
    plt.show()
There is a maximum time of 0 days 01:38:05 between the first and target events, an average of 0 days 00:09:21, and a median of 0 days 00:04:49. Based on the histogram, most values fall within roughly the first 20 minutes.
Consider the behavior of people who made the contacts_show target event and those who did not.
# splitting the audience: users who reached contacts_show vs. those who never did
contacts_show_users = df.loc[df['event'] == 'contacts_show', 'user'].unique().tolist()
print("Number of users who made the target event (contacts_show) =", len(contacts_show_users))
all_users = df['user'].unique()
print("Number of all users =", len(all_users))
non_contacts_show_users = list(set(all_users) - set(contacts_show_users))
print("The number of users who did not make the target event (contacts_show) =", len(non_contacts_show_users))
Number of users who made the target event (contacts_show) = 949 Number of all users = 4184 The number of users who did not make the target event (contacts_show) = 3235
# verifying if any of the users who did not make the target event called (contacts_call)
df[df['user'].isin(non_contacts_show_users) & (df['event'] == 'contacts_call')]
| user | source | time | event | date | lifetime | id | diff | session_id | is_first_event_in_session |
|---|
There are no such users.
# event breakdown for users who never reached the target event
non_contacts_show = (
    df[df['user'].isin(non_contacts_show_users)]['event']
    .value_counts()
    .reset_index()
)
non_contacts_show
| index | event | |
|---|---|---|
| 0 | tips_show | 25329 |
| 1 | photos_show | 6054 |
| 2 | search | 4577 |
| 3 | advert_open | 3901 |
| 4 | map | 2503 |
| 5 | favorites_add | 830 |
| 6 | tips_click | 473 |
# event breakdown for users who reached the target event;
# the target event itself and the resulting calls are excluded
contacts_show = df[df['user'].isin(contacts_show_users)]['event'].value_counts()
contacts_show = contacts_show.drop(['contacts_call', 'contacts_show']).reset_index()
contacts_show
| index | event | |
|---|---|---|
| 0 | tips_show | 8789 |
| 1 | photos_show | 3310 |
| 2 | search | 1857 |
| 3 | advert_open | 1071 |
| 4 | map | 714 |
| 5 | favorites_add | 392 |
| 6 | tips_click | 246 |
# comparing the two behaviour profiles on pie charts
plt.figure(figsize=(10, 10))
slice_values, slice_names = non_contacts_show['event'], non_contacts_show['index']
plt.pie(slice_values, labels=slice_names, autopct='%1.1f%%')
plt.title("Behavior of users who didn't make contacts_show")
plt.show()
plt.figure(figsize=(10, 10))
slice_values, slice_names = contacts_show['event'], contacts_show['index']
plt.pie(slice_values, labels=slice_names, autopct='%1.1f%%')
plt.title('Behavior of users who made contacts_show')
Who completed the target event:
# calculating the conversion of all events to contacts_show
# NOTE(review): the denominator is the number of users with at least one
# non-target event, not the number of all unique users; with 4184 users in
# total the overall conversion would be ~22.7% — confirm which definition
# of "overall conversion" is intended before comparing with other metrics
non_contacts_show = df[df['event'] !='contacts_show']['user'].nunique()
contacts_show = df[df['event'] =='contacts_show']['user'].nunique()
print(f"Overall conversion into contacts_show = {round((contacts_show/non_contacts_show*100), 2)}%")
Overall conversion into contacts_show = 22.83%
Let's calculate conversions to the target event in the context of sessions using the time_diff table we created earlier.
# displaying the head of the DataFrame
time_diff.head(3)
| user | session_id | time_first_contacts_show | time_first_event | first_event | time_delta | |
|---|---|---|---|---|---|---|
| 0 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 44 | 2019-10-20 19:17:18.659799 | 2019-10-20 18:49:24.115634 | search | 0 days 00:27:54.544165 |
| 1 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 71 | 2019-10-29 21:26:40.258472 | 2019-10-29 21:18:24.850073 | search | 0 days 00:08:15.408399 |
| 2 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 91 | 2019-10-30 08:01:05.420773 | 2019-10-30 07:50:45.948358 | search | 0 days 00:10:19.472415 |
# unique users who reached contacts_show, grouped by their session's first event
conversion = (
    time_diff.groupby('first_event')['user']
    .nunique()
    .reset_index()
    .rename(columns={'first_event': 'event', 'user': 'n_contacts_show'})
)
conversion
| event | n_contacts_show | |
|---|---|---|
| 0 | advert_open | 25 |
| 1 | favorites_add | 17 |
| 2 | map | 127 |
| 3 | photos_show | 131 |
| 4 | search | 201 |
| 5 | tips_click | 3 |
| 6 | tips_show | 284 |
# number of unique users who triggered each event type at least once
all_unique_users = (
    df.groupby('event')['user']
    .nunique()
    .reset_index()
    .rename(columns={'user': 'unique_users'})
)
all_unique_users
| event | unique_users | |
|---|---|---|
| 0 | advert_open | 722 |
| 1 | contacts_call | 212 |
| 2 | contacts_show | 949 |
| 3 | favorites_add | 338 |
| 4 | map | 1404 |
| 5 | photos_show | 1086 |
| 6 | search | 1632 |
| 7 | tips_click | 306 |
| 8 | tips_show | 2731 |
# attaching per-event user totals and computing conversion into contacts_show
conversion = conversion.merge(all_unique_users, on='event', how='left')
conversion['conversion_percent'] = (
    conversion['n_contacts_show'] / conversion['unique_users'] * 100
).round(2)
conversion.sort_values(by='conversion_percent', ascending=False)
| event | n_contacts_show | unique_users | conversion_percent | |
|---|---|---|---|---|
| 4 | search | 201 | 1632 | 12.32 |
| 3 | photos_show | 131 | 1086 | 12.06 |
| 6 | tips_show | 284 | 2731 | 10.40 |
| 2 | map | 127 | 1404 | 9.05 |
| 1 | favorites_add | 17 | 338 | 5.03 |
| 0 | advert_open | 25 | 722 | 3.46 |
| 5 | tips_click | 3 | 306 | 0.98 |
A high conversion rate is found in 'search' (12.32). Perhaps because users entered the application and were already looking for specific items. Followed by photos_show. Tips_click has the lowest conversion rate.
Identify the groups of users who return frequently to the mobile application (retention rate).
# displaying the date on which each user first used the application
# (first_visit is built earlier in the notebook: one row per user with the
# date of that user's first recorded event)
first_visit
| user | first_visit | |
|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | 2019-10-07 |
| 1 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 2019-10-19 |
| 2 | 00463033-5717-4bf1-91b4-09183923b9df | 2019-11-01 |
| 3 | 004690c3-5a84-4bb7-a8af-e0c8f8fca64e | 2019-10-18 |
| 4 | 00551e79-152e-4441-9cf7-565d7eb04090 | 2019-10-25 |
| ... | ... | ... |
| 4179 | ff82c7c7-16d6-44b1-833b-a217747b0b02 | 2019-10-15 |
| 4180 | ffab8d8a-30bb-424a-a3ab-0b63ebbf7b07 | 2019-10-13 |
| 4181 | ffc01466-fdb1-4460-ae94-e800f52eb136 | 2019-10-07 |
| 4182 | ffcf50d9-293c-4254-8243-4890b030b238 | 2019-10-23 |
| 4183 | ffe68f10-e48e-470e-be9b-eeb93128ff1a | 2019-10-21 |
4184 rows × 2 columns
# attaching each user's first-visit date to all of their events
first_visit_events = df.merge(first_visit, on='user', how='inner')
first_visit_events.head()
| user | source | time | event | date | lifetime | id | diff | session_id | is_first_event_in_session | first_visit | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:39:45.989359 | tips_show | 2019-10-07 | 16 | 0 | NaT | 0 | True | 2019-10-07 |
| 1 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:40:31.052909 | tips_show | 2019-10-07 | 16 | 1 | 0 days 00:00:45.063550 | 0 | False | 2019-10-07 |
| 2 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:41:05.722489 | tips_show | 2019-10-07 | 16 | 2 | 0 days 00:00:34.669580 | 0 | False | 2019-10-07 |
| 3 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:43:20.735461 | tips_show | 2019-10-07 | 16 | 3 | 0 days 00:02:15.012972 | 0 | False | 2019-10-07 |
| 4 | 0001b1d5-b74a-4cbf-aeb0-7df5947bf349 | other | 2019-10-07 13:45:30.917502 | tips_show | 2019-10-07 | 16 | 4 | 0 days 00:02:10.182041 | 0 | False | 2019-10-07 |
# deriving the Monday-of-week for both the event date and the first visit:
# subtracting the weekday offset snaps any date to the start of its week
for src_col, dst_col in (('date', 'event_week'), ('first_visit', 'first_visit_week')):
    as_datetime = pd.to_datetime(first_visit_events[src_col], unit='d')
    weekday_offset = pd.to_timedelta(first_visit_events[src_col].dt.dayofweek, unit='d')
    first_visit_events[dst_col] = as_datetime - weekday_offset
first_visit_events.sample(3)
| user | source | time | event | date | lifetime | id | diff | session_id | is_first_event_in_session | first_visit | event_week | first_visit_week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 22590 | 5c36ed3f-b163-41b4-9720-9d5f3a47ca75 | 2019-10-16 20:16:27.569225 | tips_show | 2019-10-16 | 1 | 22590 | 0 days 00:03:32.439738 | 22586 | False | 2019-10-16 | 2019-10-14 | 2019-10-14 | |
| 19050 | 4bb26a10-b743-42a2-948e-1d959dc26df6 | yandex | 2019-10-31 21:29:18.947462 | tips_show | 2019-10-31 | 3 | 19050 | 0 days 00:01:32.758748 | 19042 | False | 2019-10-31 | 2019-10-28 | 2019-10-28 |
| 21207 | 56da535f-3ffa-4c4e-bdda-fc37c79cd653 | other | 2019-11-01 22:15:55.160180 | tips_show | 2019-11-01 | 2 | 21207 | 0 days 00:05:03.875194 | 21204 | False | 2019-10-31 | 2019-10-28 | 2019-10-28 |
# cohort lifetime: whole weeks elapsed between first-visit week and event week
weeks_elapsed = (
    first_visit_events['event_week'] - first_visit_events['first_visit_week']
)
first_visit_events['cohort_lifetime'] = (
    weeks_elapsed / np.timedelta64(1, 'W')
).astype(int)
first_visit_events.sample(3)
| user | source | time | event | date | lifetime | id | diff | session_id | is_first_event_in_session | first_visit | event_week | first_visit_week | cohort_lifetime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 47580 | bf224017-4c82-46db-8a1e-0ec93ce65bdf | other | 2019-10-26 19:43:31.334213 | photos_show | 2019-10-26 | 1 | 47580 | 0 days 00:01:25.778453 | 47556 | False | 2019-10-26 | 2019-10-21 | 2019-10-21 | 0 |
| 61914 | f71341ae-9cb3-4747-a262-a52879262083 | yandex | 2019-10-20 11:40:34.764607 | advert_open | 2019-10-20 | 9 | 61914 | 0 days 00:00:03.567496 | 61912 | False | 2019-10-20 | 2019-10-14 | 2019-10-14 | 0 |
| 19671 | 4fef9c5d-9bb0-4df4-a476-54e92c44a4b0 | 2019-10-12 00:38:33.477258 | search | 2019-10-12 | 19 | 19671 | 0 days 00:01:55.017605 | 19668 | False | 2019-10-10 | 2019-10-07 | 2019-10-07 | 0 |
# cohort table: unique active users per (first-visit week, lifetime) pair
cohorts = (
    first_visit_events
    .groupby(['first_visit_week', 'cohort_lifetime'])
    .agg({'user': 'nunique'})
    .reset_index()
)
cohorts
| first_visit_week | cohort_lifetime | user | |
|---|---|---|---|
| 0 | 2019-10-07 | 0 | 1089 |
| 1 | 2019-10-07 | 1 | 250 |
| 2 | 2019-10-07 | 2 | 152 |
| 3 | 2019-10-07 | 3 | 102 |
| 4 | 2019-10-14 | 0 | 1139 |
| 5 | 2019-10-14 | 1 | 273 |
| 6 | 2019-10-14 | 2 | 151 |
| 7 | 2019-10-21 | 0 | 1065 |
| 8 | 2019-10-21 | 1 | 233 |
| 9 | 2019-10-28 | 0 | 891 |
# initial size of every cohort = its user count at lifetime 0
all_users = (
    cohorts.loc[cohorts['cohort_lifetime'] == 0, ['first_visit_week', 'user']]
    .rename(columns={'user': 'cohort_users'})
)
all_users
| first_visit_week | cohort_users | |
|---|---|---|
| 0 | 2019-10-07 | 1089 |
| 4 | 2019-10-14 | 1139 |
| 7 | 2019-10-21 | 1065 |
| 9 | 2019-10-28 | 891 |
# attaching the initial cohort size to every lifetime row
cohorts = cohorts.merge(all_users, on='first_visit_week')
cohorts
| first_visit_week | cohort_lifetime | user | cohort_users | |
|---|---|---|---|---|
| 0 | 2019-10-07 | 0 | 1089 | 1089 |
| 1 | 2019-10-07 | 1 | 250 | 1089 |
| 2 | 2019-10-07 | 2 | 152 | 1089 |
| 3 | 2019-10-07 | 3 | 102 | 1089 |
| 4 | 2019-10-14 | 0 | 1139 | 1139 |
| 5 | 2019-10-14 | 1 | 273 | 1139 |
| 6 | 2019-10-14 | 2 | 151 | 1139 |
| 7 | 2019-10-21 | 0 | 1065 | 1065 |
| 8 | 2019-10-21 | 1 | 233 | 1065 |
| 9 | 2019-10-28 | 0 | 891 | 891 |
# retention rate: users still active in a given week / initial cohort size
cohorts['retention'] = cohorts['user'].div(cohorts['cohort_users'])
cohorts
| first_visit_week | cohort_lifetime | user | cohort_users | retention | |
|---|---|---|---|---|---|
| 0 | 2019-10-07 | 0 | 1089 | 1089 | 1.000000 |
| 1 | 2019-10-07 | 1 | 250 | 1089 | 0.229568 |
| 2 | 2019-10-07 | 2 | 152 | 1089 | 0.139578 |
| 3 | 2019-10-07 | 3 | 102 | 1089 | 0.093664 |
| 4 | 2019-10-14 | 0 | 1139 | 1139 | 1.000000 |
| 5 | 2019-10-14 | 1 | 273 | 1139 | 0.239684 |
| 6 | 2019-10-14 | 2 | 151 | 1139 | 0.132572 |
| 7 | 2019-10-21 | 0 | 1065 | 1065 | 1.000000 |
| 8 | 2019-10-21 | 1 | 233 | 1065 | 0.218779 |
| 9 | 2019-10-28 | 0 | 891 | 891 | 1.000000 |
# pivoting retention into a cohort-week x lifetime matrix
retention = cohorts.pivot_table(
    index='first_visit_week',
    columns='cohort_lifetime',
    values='retention',
    aggfunc='sum',
)
retention
| cohort_lifetime | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| first_visit_week | ||||
| 2019-10-07 | 1.0 | 0.229568 | 0.139578 | 0.093664 |
| 2019-10-14 | 1.0 | 0.239684 | 0.132572 | NaN |
| 2019-10-21 | 1.0 | 0.218779 | NaN | NaN |
| 2019-10-28 | 1.0 | NaN | NaN | NaN |
# plotting retention curves, one line per acquisition cohort
report = retention
ax = report.T.plot(
    grid=True,
    xticks=list(report.columns.values),
    figsize=(15, 5),
)
ax.set_xlabel('Lifetime')
ax.set_ylabel('Retention')
ax.set_title('Curves of retention based on acquisition days')
plt.show()
Let's look at the heatmap.
# plotting a heatmap of retention per cohort and lifetime week;
# lifetime 0 is always 100% by construction, so that column is excluded
plt.figure(figsize=(15, 6))
sns.heatmap(retention.drop(columns=[0]), annot=True, fmt='.2%')
plt.title('Retention Heatmap')
plt.show()
Users who were attracted on October 14 showed the highest retention rates, as indicated by the light spots on the chart.
Conclusion:
It is necessary to generate clustering features in order to perform clustering.
# counting the number of events and sessions per user
mobile_clusters = df.pivot_table(index='user', aggfunc={'event':'count', 'session_id': 'nunique'})
mobile_clusters.columns = ['event_count', 'session_count']
mobile_clusters = mobile_clusters.sort_values(by='event_count', ascending = False).reset_index()
# merging mobile_clusters and user_days
# (user_days is built earlier in the notebook; presumably one row per user
#  with their number of active days — confirm against the upstream cell)
mobile_clusters = mobile_clusters.merge(user_days, on = 'user')
# counting the number of events per user by types of these events
event = (df.pivot_table(index='user',
                columns='event',
                aggfunc={'event':'count'}).fillna(0))
# flattening the ('event', <name>) MultiIndex into 'event_<name>' columns
event.columns = event.columns.map('_'.join)
event = event.reset_index()
# merging mobile_clusters and event
mobile_clusters = mobile_clusters.merge(event, on = 'user')
# counting the number of sources per user by types of these sources
sources = df.pivot_table(index='user', columns='source', aggfunc={'source':'count'}).fillna(0)
sources.columns = sources.columns.map('_'.join)
sources = sources.reset_index()
# merging mobile_clusters and sources
mobile_clusters = mobile_clusters.merge(sources, on = 'user')
# setting index so only numeric feature columns remain in the matrix
mobile_clusters = mobile_clusters.set_index('user')
# standardizing data: clustering is distance-based, so features are brought
# to zero mean / unit variance before K-means and linkage
sc = StandardScaler()
# training sc and simultaneously transforming the matrix for the training sample
X_sc = sc.fit_transform(mobile_clusters)
# displaying on the screen
X_sc[:3]
array([[ 6.44168298, 10.43020263, 1.88451478, -0.26913889, 2.42224671,
1.91642089, -0.18336371, -0.41622581, 8.31092329, 15.77978449,
-0.19189254, -0.62727899, 10.73730025, -0.37531285, -0.48811893],
[ 6.32832187, 0.68113022, 0.88988731, 4.03405209, -0.16610222,
0.04020882, -0.18336371, 2.8318105 , -0.31123916, -0.1502913 ,
0.92476726, 6.98831964, -0.36116441, -0.37531285, 8.40845709],
[ 6.32832187, 5.75064787, 2.87914225, -0.26913889, 10.18729351,
3.16722894, -0.18336371, -0.41622581, 14.70801157, -0.42976631,
-0.19189254, -0.62727899, -0.36116441, 11.01348501, -0.48811893]])
# passing the standardized matrix to linkage(); Ward's method keeps the
# in-cluster variance low and yields a readable dendrogram
linked = linkage(X_sc, method='ward')
# plotting a dendrogram
plt.figure(figsize=(15, 10))
dendrogram(linked, orientation='top')
# fix: corrected the "Hierarchial" typo in the user-facing chart title
plt.title('Hierarchical clustering')
plt.show()
# setting the model k_means with the number of clusters 4
km = KMeans(n_clusters=4, random_state=0)
# predicting clusters for observations (the algorithm assigns them numbers from 0 to 2)
labels = km.fit_predict(X_sc)
# displaying on the screen
labels
array([2, 3, 2, ..., 1, 1, 1], dtype=int32)
# storing the predicted cluster label for every user
mobile_clusters['cluster'] = labels
# quick look at the labelled table
mobile_clusters.head(3)
| event_count | session_count | lifetime | event_advert_open | event_contacts_call | event_contacts_show | event_favorites_add | event_map | event_photos_show | event_search | event_tips_click | event_tips_show | source_google | source_other | source_yandex | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user | ||||||||||||||||
| 25069cad-0d00-48cb-a627-0871a877307e | 129 | 29 | 13 | 0.0 | 2.0 | 7.0 | 0.0 | 0.0 | 62.0 | 58.0 | 0.0 | 0.0 | 129.0 | 0.0 | 0.0 | 2 |
| 9c6c88b2-5d72-496c-9417-eb64f3454a73 | 127 | 4 | 8 | 19.0 | 0.0 | 1.0 | 0.0 | 6.0 | 0.0 | 1.0 | 1.0 | 99.0 | 0.0 | 0.0 | 127.0 | 3 |
| 62a5375a-eb94-4ed2-90ef-3d79d8e0c359 | 127 | 17 | 18 | 0.0 | 8.0 | 11.0 | 0.0 | 0.0 | 108.0 | 0.0 | 0.0 | 0.0 | 0.0 | 127.0 | 0.0 | 2 |
# moving 'user' back to a column and counting users in each cluster
mobile_clusters = mobile_clusters.reset_index()
cluster_sizes = mobile_clusters.groupby('cluster')['user'].count()
cluster_sizes
cluster 0 590 1 3294 2 64 3 236 Name: user, dtype: int64
There are 64 users in the smallest group and 3294 users in the largest group.
# silhouette score ranges from -1 to 1; higher means better-separated clusters
print('Silhouette_score: {:.2f}'.format(silhouette_score(X_sc, labels)))
Silhouette_score: 0.37
A silhouette score can range from -1 to 1. The closer to 1, the better the clustering. A silhouette score of 0.37 indicates moderate clustering quality: the clusters are distinguishable but overlap somewhat.
# displaying feature mean values for each cluster
# numeric_only=True keeps the non-numeric 'user' id column out of the mean:
# older pandas silently dropped it, newer versions raise without this flag
mobile_clusters.groupby('cluster').mean(numeric_only=True).T
| cluster | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| event_count | 17.961017 | 10.203704 | 65.640625 | 67.033898 |
| session_count | 4.183051 | 1.496964 | 12.765625 | 5.135593 |
| lifetime | 12.150847 | 1.546752 | 12.281250 | 7.224576 |
| event_advert_open | 0.776271 | 0.654827 | 2.437500 | 9.326271 |
| event_contacts_call | 0.364407 | 0.052823 | 2.312500 | 0.000000 |
| event_contacts_show | 1.376271 | 0.463874 | 4.062500 | 4.432203 |
| event_favorites_add | 0.511864 | 0.152398 | 2.640625 | 1.055085 |
| event_map | 0.659322 | 0.590468 | 0.046875 | 3.728814 |
| event_photos_show | 3.350847 | 1.459320 | 40.312500 | 0.000000 |
| event_search | 3.033898 | 1.004857 | 12.546875 | 2.250000 |
| event_tips_click | 0.145763 | 0.113540 | 0.031250 | 1.088983 |
| event_tips_show | 7.742373 | 5.711597 | 1.250000 | 45.152542 |
| source_google | 4.408475 | 2.945355 | 28.968750 | 14.436441 |
| source_other | 4.820339 | 2.821190 | 14.968750 | 18.711864 |
| source_yandex | 8.732203 | 4.437158 | 21.703125 | 33.885593 |
# calculating the overall mean of every numeric feature
# numeric_only=True excludes the string 'user' column (silently dropped in
# older pandas, an error in newer versions)
mobile_clusters.drop('cluster', axis=1).mean(numeric_only=True)
event_count 15.351099 session_count 2.253346 lifetime 3.526530 event_advert_open 1.188337 event_contacts_call 0.128346 event_contacts_show 0.871415 event_favorites_add 0.292065 event_map 0.768881 event_photos_show 2.238050 event_search 1.537763 event_tips_click 0.171845 event_tips_show 8.154398 source_google 4.197897 source_other 4.185229 source_yandex 6.967973 dtype: float64
Let's look at the key metrics.
# users per cluster (any always-present column works for the count)
clusters = (mobile_clusters
            .pivot_table(index='cluster', aggfunc={'event_advert_open': 'count'})
            .rename(columns={'event_advert_open': 'users_count'}))
# joining the per-cluster mean of every key metric
metric_cols = ['event_count', 'session_count', 'lifetime', 'event_advert_open',
               'event_photos_show', 'event_search', 'event_contacts_show',
               'event_tips_show', 'source_google', 'source_other', 'source_yandex']
clusters = clusters.join(mobile_clusters.pivot_table(index='cluster', values=metric_cols))
# fixing the display order of the metrics
display_order = ['users_count', 'event_count', 'session_count', 'lifetime',
                 'event_search', 'event_tips_show',
                 'event_advert_open', 'event_photos_show', 'event_contacts_show',
                 'source_google', 'source_yandex', 'source_other']
clusters[display_order].T
| cluster | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| users_count | 590.000000 | 3294.000000 | 64.000000 | 236.000000 |
| event_count | 17.961017 | 10.203704 | 65.640625 | 67.033898 |
| session_count | 4.183051 | 1.496964 | 12.765625 | 5.135593 |
| lifetime | 12.150847 | 1.546752 | 12.281250 | 7.224576 |
| event_search | 3.033898 | 1.004857 | 12.546875 | 2.250000 |
| event_tips_show | 7.742373 | 5.711597 | 1.250000 | 45.152542 |
| event_advert_open | 0.776271 | 0.654827 | 2.437500 | 9.326271 |
| event_photos_show | 3.350847 | 1.459320 | 40.312500 | 0.000000 |
| event_contacts_show | 1.376271 | 0.463874 | 4.062500 | 4.432203 |
| source_google | 4.408475 | 2.945355 | 28.968750 | 14.436441 |
| source_yandex | 8.732203 | 4.437158 | 21.703125 | 33.885593 |
| source_other | 4.820339 | 2.821190 | 14.968750 | 18.711864 |
Group 0
Despite being the second largest group (590 users), this is not the most active group. Users in this group make a small number of events (18) and sessions (4), active for 12 days. The vast majority of users see ads in Yandex recommendations, but seldom open them and do not view photos or contacts. There is a better situation than in group 1, but it is much worse than in groups 2 and 3. Advertisements may be saved to favorites and purchases delayed.
Group 1
This group has the most participants (3294) and is the most passive. There are 10 events in total, 1 session and 1 day of activity. A majority of users come from the recommendation system of all three search engines and rarely search for something themselves. Users open ads occasionally, look at photos a little bit, and rarely make the target event - viewing contacts - themselves.
Group 2
The smallest group of users (64). Users take part in 65 events and 12 sessions during a 12 day period. A lot of users search for ads on their own, look at photos and view contacts. Google and Yandex are the leading sources of traffic. These users clearly understand the product / service they are looking for and therefore most often only use search.
Group 3
It is the third largest (236) and the most active group. Users in this group perform the most events (67) across a fair number of sessions (5), and were active within the past week. Most users open ads in Yandex recommendations, but rarely view photos. Viewing contacts is their target event, and this group leads in the number of contacts_show events per user.
# selecting the behaviour features whose distributions we will plot per cluster
features = ['event_count', 'session_count', 'lifetime',
            'event_search', 'event_tips_show',
            'event_advert_open', 'event_photos_show', 'event_contacts_show']
features
['event_count', 'session_count', 'lifetime', 'event_search', 'event_tips_show', 'event_advert_open', 'event_photos_show', 'event_contacts_show']
# density plot of every selected feature, split by cluster
for feature in features:
    plt.figure(figsize=(12, 4))
    axis = sns.kdeplot(x=mobile_clusters[feature], hue='cluster',
                       data=mobile_clusters, palette='rainbow', legend=True)
    axis.set_xlim(0, 40)
    plt.title(feature)
    plt.show()
The following features of clusters can be identified from the graph:
The first and second clusters have good event_count, session_count, and lifetime indicators, but poor event_advert_open, event_photos_show, and event_contacts_show.
In the time_diff table we calculated the time from the first to the target event in the time_delta column. Let's combine this table with mobile_clusters to see how the times differ between the events of users from different clusters.
# resetting mobile_clusters index so 'user' is available for the merge below
mobile_clusters = mobile_clusters.reset_index()
# average time from the first to the target event for each unique user
# (numeric_only=False lets mean() aggregate the timedelta column)
time = (
    time_diff.groupby('user')['time_delta']
    .mean(numeric_only=False)
    .reset_index()
)
time.head()
| user | time_delta | |
|---|---|---|
| 0 | 00157779-810c-4498-9e05-a1e9e3cedf93 | 0 days 00:15:29.808326333 |
| 1 | 00551e79-152e-4441-9cf7-565d7eb04090 | 0 days 00:02:24.522385 |
| 2 | 007d031d-5018-4e02-b7ee-72a30609173f | 0 days 00:05:42.504158 |
| 3 | 0103a07d-513f-42b9-8d91-d5891d5655fe | 0 days 00:07:04.858443 |
| 4 | 01556e76-d389-43bd-9fc7-1a3ba9802f49 | 0 days 00:00:26.363986 |
# joining the per-user average time onto the cluster table;
# any missing deltas are replaced with a zero timedelta
time = mobile_clusters.merge(time, on='user').fillna(pd.Timedelta('0 days'))
time
| index | user | event_count | session_count | lifetime | event_advert_open | event_contacts_call | event_contacts_show | event_favorites_add | event_map | event_photos_show | event_search | event_tips_click | event_tips_show | source_google | source_other | source_yandex | cluster | time_delta | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 25069cad-0d00-48cb-a627-0871a877307e | 129 | 29 | 13 | 0.0 | 2.0 | 7.0 | 0.0 | 0.0 | 62.0 | 58.0 | 0.0 | 0.0 | 129.0 | 0.0 | 0.0 | 2 | 0 days 00:07:13.752840600 |
| 1 | 1 | 9c6c88b2-5d72-496c-9417-eb64f3454a73 | 127 | 4 | 8 | 19.0 | 0.0 | 1.0 | 0.0 | 6.0 | 0.0 | 1.0 | 1.0 | 99.0 | 0.0 | 0.0 | 127.0 | 3 | 0 days 00:45:30.036210 |
| 2 | 2 | 62a5375a-eb94-4ed2-90ef-3d79d8e0c359 | 127 | 17 | 18 | 0.0 | 8.0 | 11.0 | 0.0 | 0.0 | 108.0 | 0.0 | 0.0 | 0.0 | 0.0 | 127.0 | 0.0 | 2 | 0 days 00:03:30.922325 |
| 3 | 5 | 7a4806d3-b727-43f7-9edc-d5068869d95d | 124 | 7 | 11 | 0.0 | 0.0 | 4.0 | 0.0 | 6.0 | 0.0 | 0.0 | 10.0 | 104.0 | 0.0 | 0.0 | 124.0 | 3 | 0 days 00:00:02.125164 |
| 4 | 6 | f2c745ca-e1bf-4ea9-af47-ce48224c484c | 123 | 12 | 12 | 0.0 | 0.0 | 7.0 | 0.0 | 28.0 | 0.0 | 4.0 | 0.0 | 84.0 | 0.0 | 123.0 | 0.0 | 3 | 0 days 00:02:40.779818 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 719 | 4109 | 8b1ec6b6-01d2-4df5-b7e9-a0e9dd4f6242 | 3 | 1 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 1 | 0 days 00:00:47.369456 |
| 720 | 4115 | 0103a07d-513f-42b9-8d91-d5891d5655fe | 2 | 1 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 2.0 | 1 | 0 days 00:07:04.858443 |
| 721 | 4141 | 06ab04c0-c585-4f5c-9988-cd3d0e8ca7c3 | 2 | 1 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1 | 0 days 00:04:34.220344 |
| 722 | 4148 | 73f42828-b55a-4b31-a2ef-aaed3569b3e8 | 2 | 1 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 1 | 0 days 00:00:13.102988 |
| 723 | 4158 | 8b949d31-8b91-4999-b1db-7015420e3f10 | 2 | 1 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 0.0 | 0.0 | 1 | 0 days 00:03:32.154449 |
724 rows × 19 columns
# mean time to the target event per cluster
# (numeric_only=False lets .mean() run on the timedelta column)
time.groupby('cluster')['time_delta'].mean(numeric_only=False)
cluster 0 0 days 00:08:54.370492281 1 0 days 00:09:34.864186535 2 0 days 00:09:02.548041184 3 0 days 00:12:38.546377831 Name: time_delta, dtype: timedelta64[ns]
It takes 9:02 minutes for group 2 to reach the target event, while group 3 takes 12:38 minutes. The leading group, group 3, spends the most time, since its users perform more actions.
The times for the most inactive group, group 1, and the slightly more active group 0 are practically the same: group 1 spends 9:34 minutes and group 0 spends 8:54 minutes.
# counting, per cluster, the users who reached the target event at least once
con = (
    mobile_clusters
    .query('event_contacts_show !=0')
    .groupby('cluster')['user']
    .count()
    .reset_index()
    .rename(columns={'user': 'user_contacts_show'})
)
con
| cluster | user_contacts_show | |
|---|---|---|
| 0 | 0 | 197 |
| 1 | 1 | 614 |
| 2 | 2 | 40 |
| 3 | 3 | 98 |
# total number of users in each cluster
users = (
    mobile_clusters
    .groupby('cluster')['user']
    .count()
    .reset_index()
    .rename(columns={'user': 'user_count'})
)
users
| cluster | user_count | |
|---|---|---|
| 0 | 0 | 590 |
| 1 | 1 | 3294 |
| 2 | 2 | 64 |
| 3 | 3 | 236 |
# conversion to the target event per cluster, in percent:
# converters divided by cluster size
conversion = con.merge(users, on='cluster')
conversion['conversion'] = (
    conversion['user_contacts_show'] / conversion['user_count'] * 100
)
conversion
| cluster | user_contacts_show | user_count | conversion | |
|---|---|---|---|---|
| 0 | 0 | 197 | 590 | 33.389831 |
| 1 | 1 | 614 | 3294 | 18.639951 |
| 2 | 2 | 40 | 64 | 62.500000 |
| 3 | 3 | 98 | 236 | 41.525424 |
The highest conversion rate was in group 2 (62%).
# attaching the cluster label of every user to the event-level table
ret = first_visit_events.merge(mobile_clusters, on='user')
# spot-checking a few random rows
ret.sample(3)
| user | source | time | event | date | lifetime_x | id | diff | session_id | is_first_event_in_session | ... | event_favorites_add | event_map | event_photos_show | event_search | event_tips_click | event_tips_show | source_google | source_other | source_yandex | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15612 | 3bd2c291-242b-4bb9-8762-bb5f74b21acb | yandex | 2019-10-11 21:03:30.655534 | tips_show | 2019-10-11 | 3 | 15612 | 0 days 00:01:54.377195 | 15609 | False | ... | 0.0 | 1.0 | 0.0 | 4.0 | 0.0 | 7.0 | 0.0 | 0.0 | 12.0 | 1 |
| 40334 | a11eaf50-0096-4984-8a20-0a038b532e1c | yandex | 2019-10-14 10:58:49.876360 | tips_show | 2019-10-14 | 1 | 40334 | 0 days 00:00:23.470774 | 40327 | False | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 0.0 | 9.0 | 1 |
| 4043 | 0e2657c4-f800-49c2-a1d1-8856086246ac | other | 2019-10-19 20:12:40.636219 | search | 2019-10-19 | 1 | 4043 | NaT | 4043 | True | ... | 0.0 | 1.0 | 0.0 | 2.0 | 0.0 | 4.0 | 0.0 | 7.0 | 0.0 | 1 |
3 rows × 31 columns
Retention rate for cluster 0
# retention for cluster 0: weekly cohorts by lifetime in weeks
cohorts = (
    ret.query('cluster == 0')
    .groupby(['first_visit_week', 'cohort_lifetime'])
    .agg({'user': 'nunique'})
    .reset_index()
)
# cohort size = users observed at lifetime 0
sizes = cohorts.loc[cohorts['cohort_lifetime'] == 0, ['first_visit_week', 'user']]
sizes = sizes.rename(columns={'user': 'cohort_users'})
# share of the cohort still active at each lifetime step
cohorts = cohorts.merge(sizes, on='first_visit_week')
cohorts['retention'] = cohorts['user'] / cohorts['cohort_users']
# cohort-by-lifetime retention table
ret3 = cohorts.pivot_table(
    index='first_visit_week',
    columns='cohort_lifetime',
    values='retention',
    aggfunc='sum',
)
ret3
| cohort_lifetime | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| first_visit_week | ||||
| 2019-10-07 | 1.0 | 0.564103 | 0.491453 | 0.307692 |
| 2019-10-14 | 1.0 | 0.674419 | 0.544186 | NaN |
| 2019-10-21 | 1.0 | 0.864407 | NaN | NaN |
| 2019-10-28 | 1.0 | NaN | NaN | NaN |
The users aren't inclined to return, perhaps because they mostly see Yandex recommendations with ads and are unlikely to search for something on their own.
Retention rate for cluster 1
# retention for cluster 1: weekly cohorts by lifetime in weeks
cohorts = (
    ret.query('cluster == 1')
    .groupby(['first_visit_week', 'cohort_lifetime'])
    .agg({'user': 'nunique'})
    .reset_index()
)
# cohort size = users observed at lifetime 0
sizes = cohorts.loc[cohorts['cohort_lifetime'] == 0, ['first_visit_week', 'user']]
sizes = sizes.rename(columns={'user': 'cohort_users'})
# share of the cohort still active at each lifetime step
cohorts = cohorts.merge(sizes, on='first_visit_week')
cohorts['retention'] = cohorts['user'] / cohorts['cohort_users']
# cohort-by-lifetime retention table
ret3 = cohorts.pivot_table(
    index='first_visit_week',
    columns='cohort_lifetime',
    values='retention',
    aggfunc='sum',
)
ret3
| cohort_lifetime | 0 | 1 | 2 |
|---|---|---|---|
| first_visit_week | |||
| 2019-10-07 | 1.0 | 0.077939 | 0.002642 |
| 2019-10-14 | 1.0 | 0.083933 | NaN |
| 2019-10-21 | 1.0 | 0.095622 | NaN |
| 2019-10-28 | 1.0 | NaN | NaN |
After four weeks, Group 1 (the most passive group) did not return.
Retention rate for cluster 2
# retention for cluster 2: weekly cohorts by lifetime in weeks
cohorts = (
    ret.query('cluster == 2')
    .groupby(['first_visit_week', 'cohort_lifetime'])
    .agg({'user': 'nunique'})
    .reset_index()
)
# cohort size = users observed at lifetime 0
sizes = cohorts.loc[cohorts['cohort_lifetime'] == 0, ['first_visit_week', 'user']]
sizes = sizes.rename(columns={'user': 'cohort_users'})
# share of the cohort still active at each lifetime step
cohorts = cohorts.merge(sizes, on='first_visit_week')
cohorts['retention'] = cohorts['user'] / cohorts['cohort_users']
# cohort-by-lifetime retention table
ret3 = cohorts.pivot_table(
    index='first_visit_week',
    columns='cohort_lifetime',
    values='retention',
    aggfunc='sum',
)
ret3
| cohort_lifetime | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| first_visit_week | ||||
| 2019-10-07 | 1.0 | 0.947368 | 0.736842 | 0.526316 |
| 2019-10-14 | 1.0 | 0.863636 | 0.590909 | NaN |
| 2019-10-21 | 1.0 | 0.882353 | NaN | NaN |
| 2019-10-28 | 1.0 | NaN | NaN | NaN |
Group 2 has the highest retention rate, perhaps because users like to search for things on their own.
Retention rate for cluster 3
# retention for cluster 3: weekly cohorts by lifetime in weeks
cohorts = (
    ret.query('cluster == 3')
    .groupby(['first_visit_week', 'cohort_lifetime'])
    .agg({'user': 'nunique'})
    .reset_index()
)
# cohort size = users observed at lifetime 0
sizes = cohorts.loc[cohorts['cohort_lifetime'] == 0, ['first_visit_week', 'user']]
sizes = sizes.rename(columns={'user': 'cohort_users'})
# share of the cohort still active at each lifetime step
cohorts = cohorts.merge(sizes, on='first_visit_week')
cohorts['retention'] = cohorts['user'] / cohorts['cohort_users']
# cohort-by-lifetime retention table
ret3 = cohorts.pivot_table(
    index='first_visit_week',
    columns='cohort_lifetime',
    values='retention',
    aggfunc='sum',
)
ret3
| cohort_lifetime | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| first_visit_week | ||||
| 2019-10-07 | 1.0 | 0.518987 | 0.265823 | 0.253165 |
| 2019-10-14 | 1.0 | 0.573529 | 0.308824 | NaN |
| 2019-10-21 | 1.0 | 0.532258 | NaN | NaN |
| 2019-10-28 | 1.0 | NaN | NaN | NaN |
It is interesting to note that users in group 3 are less inclined to return, despite the fact that it is one of the leading groups. This is perhaps because they are more likely to view ads through search engine recommendations and are not interested in searching independently.
Conclusion:
Motivation recommendations for users in groups 0 and 1:
The ad serving system needs to be improved. Users see ads in all search engines, but are not interested in looking at photos or even contacts. Users are not shown what is relevant to them. It is necessary to find out what interests them and build algorithms based on this information.
Some users installed the application using a link from yandex, others from google. Test the hypothesis: in both groups, conversions to contact views have statistically significant differences.
Test the hypothesis: site search conversions to contact views and recommender conversions to contact views have statistically significant differences.
Let's formulate the null and alternative hypotheses:
# saving users coming from yandex
# NOTE(review): `yandex` and `google` below are not referenced again in this
# analysis — presumably kept for inspection; confirm before removing
yandex = df.query('source =="yandex" and event == "contacts_show"')
# saving users coming from google
google = df.query('source =="google" and event == "contacts_show"')
# counting unique users per traffic source, excluding 'other'
users_by_source = df.query('source !="other"').groupby('source')['user'].nunique()
users_by_source
source google 1107 yandex 1882 Name: user, dtype: int64
# per-source table: unique users who viewed contacts (excluding 'other'),
# alongside the total unique users from that source
users_by_events = (
    df.query('event == "contacts_show" and source !="other"')
    .pivot_table(index='source', columns='event', values='user', aggfunc='nunique')
    .reset_index()
)
# total unique users per source, looked up in users_by_source by index
users_by_events['users'] = users_by_events['source'].map(users_by_source)
users_by_events = users_by_events.set_index('source')
users_by_events.columns = ['contacts_show', 'users']
users_by_events
| contacts_show | users | |
|---|---|---|
| source | ||
| 264 | 1107 | |
| yandex | 467 | 1882 |
# conversion rate (%) of users who installed the application via a google link
google_conversion = users_by_events.query('source == "google"')
google_conversion = round(google_conversion['contacts_show'] / google_conversion['users'] * 100, 2)
google_conversion
source google 23.85 dtype: float64
# conversion rate (%) of users who installed the application via a yandex link
yandex_conversion = users_by_events.query('source == "yandex"')
yandex_conversion = round(yandex_conversion['contacts_show'] / yandex_conversion['users'] * 100, 2)
yandex_conversion
source yandex 24.81 dtype: float64
# creating a function to test the hypotheses
def testing_hypothesis(contacts_show, users, alpha):
# proportion of successes in the first group:
p1 = contacts_show[0]/users[0]
# proportion of successes in the second group:
p2 = contacts_show[1]/users[1]
# proportion of successes in the combined dataset:
p_combined = (contacts_show[0] + contacts_show[1]) / (users[0] + users[1])
# difference in proportions
difference = p1 - p2
# z-value
z_value = difference / mth.sqrt(p_combined * (1 - p_combined) * (1/users[0] + 1/users[1]))
# standard normal distribution (mean 0, standard deviation 1)
distr = st.norm(0, 1)
# p-value
p_value = (1 - distr.cdf(abs(z_value))) * 2
print('p-value: ', p_value)
if p_value < alpha:
print('We reject the null hypothesis: there are statistically significant differences between the conversions of users who installed the app via a link from yandex and users who installed the app via a link from google.')
else:
print(
'We failed to reject the null hypothesis, there are no statistically significant differences between the conversions of users who installed the application via a link from yandex and users who installed the application via a link from google.')
# pulling the success counts and group sizes out of the per-source table
contacts_show = users_by_events['contacts_show']
users = users_by_events['users']
# running the two-proportion z-test at significance level 0.05
testing_hypothesis(contacts_show, users, .05)
p-value: 0.5530408241544031 We failed to reject the null hypothesis, there are no statistically significant differences between the conversions of users who installed the application via a link from yandex and users who installed the application via a link from google.
At a given critical level of statistical significance, there are no statistically significant differences between the conversions of users who installed the application via a link from yandex and users who installed the application via a link from google. The source has no effect on the target event's views.
# unique users who saw recommendations (tips_show)
tips_show_users = list(df.query('event =="tips_show"')['user'].unique())
# unique users who used the site search (search)
search_users = list(df.query('event =="search"')['user'].unique())
print('Number of unique users tips_show =', len(tips_show_users))
print('Number of unique users search =', len(search_users))
Number of unique users tips_show = 2731 Number of unique users search = 1632
# conversion of tips_show users into contact viewers
tips_events = df.query('user==@tips_show_users')[['user', 'event']]
tips_show_contacts = tips_events.loc[
    tips_events['event'] == "contacts_show", 'user'].nunique()
tips_show_contacts_conversion = round(
    tips_show_contacts / len(tips_show_users) * 100, 2)
print('Total unique tips_show users who completed contacts_show =', tips_show_contacts)
print(f'Conversion to contacts_show for users tips_show = {tips_show_contacts_conversion}%')
Total unique tips_show users who completed contacts_show = 492 Conversion to contacts_show for users tips_show = 18.02%
# conversion of search users into contact viewers
search_subset = df.query('user==@search_users')[['user', 'event']]
search_contacts = search_subset.loc[
    search_subset['event'] == "contacts_show", 'user'].nunique()
search_contacts_conversion = round(
    search_contacts / len(search_users) * 100, 2)
print('Total unique search users who completed contacts_show =', search_contacts)
print(f'Conversion to contacts_show for search users = {search_contacts_conversion}%')
Total unique search users who completed contacts_show = 362 Conversion to contacts_show for search users = 22.18%
What we have:
Total unique tips_show users who completed contacts_show = 492
Total unique search users who completed contacts_show = 362
Let's formulate the null and alternative hypotheses:
# creating a function to test the hypotheses
def testing_hypothesis(successes, trials, alpha):
# proportion of successes in the first group:
p1 = successes[0]/trials[0]
# proportion of successes in the second group:
p2 = successes[1]/trials[1]
# proportion of successes in the combined dataset:
p_combined = (successes[0] + successes[1]) / (trials[0] + trials[1])
# difference in proportions
difference = p1 - p2
# z_value
z_value = difference / mth.sqrt(p_combined * (1 - p_combined) * (1/trials[0] + 1/trials[1]))
# standard normal distribution (mean 0, standard deviation 1)
distr = st.norm(0, 1)
# p-value
p_value = (1 - distr.cdf(abs(z_value))) * 2
print('p-value: ', p_value)
if p_value < alpha:
print('We reject the null hypothesis: there are statistically significant differences between conversion rates of site search (search) to contact views and recommendation system (tips_show) to contact views.')
else:
print(
'We failed to reject the null hypothesis, there are no statistically significant differences between conversion rates of site search (search) to contact views and recommendation system (tips_show) to contact views.')
# successes = users who viewed contacts in each funnel (tips_show, search)
successes = [tips_show_contacts, search_contacts]
# trials = total users who entered each funnel, same order as successes
trials = [len(tips_show_users), len(search_users)]
# displaying on the screen
successes
[492, 362]
# displaying the group sizes (tips_show, search) on the screen
trials
[2731, 1632]
# running the two-proportion z-test at significance level 0.05
testing_hypothesis(successes, trials, .05)
p-value: 0.0007910164571709455 We reject the null hypothesis: there are statistically significant differences between conversion rates of site search (search) to contact views and recommendation system (tips_show) to contact views.
At a given critical level of statistical significance, there are statistically significant differences between conversion rates of site search (search) to contact views and recommendation system (tips_show) to contact views.
Conclusion:
Exploratory data analysis:
User segmentation:
Hypotheses testing:
Recommendations:
The ad serving system needs to be improved. Users see ads in all search engines, but are not interested in looking at photos or even contacts. Users are not shown what is relevant to them. It is necessary to find out what interests them and build algorithms based on this information.